###############################################################################   
#### Replication Materials                                                 #### 
#### Taegyoon Kim, 2022. Violent Political Rhetoric on Twitter.            ####
#### Political Science Research and Methods                                ####
###############################################################################  



#################################### Set Up ###################################


## packages

import pandas as pd
import numpy as np
import random, os

from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from sklearn.metrics import confusion_matrix


## path

path_data = 'kim_psrm_replication/data/' 
path_output = 'kim_psrm_replication/output/' 



################################### Table A4 ##################################

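# run this whole section at once (from the report_results definition through the CSV export); report_results is defined again, differently, in the Table A5 section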
    
## performance report

def report_results(A, B):
    """Return ``[precision, recall, f1]`` for predictions ``A`` against labels ``B``.

    ``A`` and ``B`` are placed side by side in a DataFrame and any row with a
    missing value in either column is dropped before scoring.

    Parameters
    ----------
    A : predicted labels (handed to the metrics as the second argument).
    B : reference labels (handed to the metrics as the first argument).

    Returns
    -------
    list
        Precision, recall and F-1, in that order.
    """
    paired = pd.DataFrame({'A': A, 'B': B}).dropna()
    predicted, truth = paired['A'], paired['B']

    # Order matters: callers label the resulting columns Precision, Recall, F-1.
    scorers = (precision_score, recall_score, f1_score)
    return [scorer(truth, predicted) for scorer in scorers]


## lr/rf/xgb * count/tfidf/glove + BERT

# every file in the data directory whose name starts with 'df_cv'
df_cv_list = [entry for entry in os.listdir(path_data) if entry.startswith('df_cv')]


for file in df_cv_list:

    print(file)
    df_cv = pd.read_csv(path_data + file)

    # one [precision, recall, f1] row per fold; folds are numbered 1 to 5
    performance_all = []
    for fold in range(1, 6):
        fold_rows = df_cv[df_cv['fold'] == fold]
        performance_all.append(
            report_results(fold_rows['predicted'], fold_rows['true']))

    # metrics as rows, folds as columns, plus the across-fold mean, in percent
    tbl = pd.DataFrame(performance_all, columns=['Precision', 'Recall', 'F-1']).T
    tbl.columns = ['Fold 1', 'Fold 2', 'Fold 3', 'Fold 4', 'Fold 5']
    tbl['Mean'] = tbl.T.mean()
    tbl = (tbl * 100).round(2)
    print(tbl, '\n')

    # drop the first 6 characters and the last 4 of the input name
    # (the lengths of 'df_cv_' and '.csv') to get the model tag
    stem = file[6:]
    file_name = stem[:len(stem) - 4]
    out_path = path_output + 'tbla4_' + file_name + '.csv'
    tbl['Mean'].to_csv(out_path)



################################### Table A5 ##################################
    
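# run this whole section at once (from the report_results definition through the CSV export); it redefines report_results to also return accuracy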

## performance report

def report_results(A, B):
    """Return ``[precision, recall, f1, accuracy]`` for predictions ``A`` vs labels ``B``.

    ``A`` and ``B`` are placed side by side in a DataFrame and any row with a
    missing value in either column is dropped before scoring.

    Parameters
    ----------
    A : predicted labels (handed to the metrics as the second argument).
        Anything ``pd.DataFrame`` accepts as a column works; a ``.name``
        attribute is not required.
    B : reference labels (handed to the metrics as the first argument).

    Returns
    -------
    list
        Precision, recall, F-1 and accuracy, in that order.
    """
    df = pd.DataFrame({'A': A,
                       'B': B})
    df = df.dropna()
    predicted = df['A']
    truth = df['B']

    prec = precision_score(truth, predicted)
    rec = recall_score(truth, predicted)
    f1 = f1_score(truth, predicted)
    acc = accuracy_score(truth, predicted)

    # Order matters: callers label the resulting columns
    # Precision, Recall, F-1, Accuracy.
    return [prec, rec, f1, acc]


## bert

df_cv_bert = pd.read_csv(path_data + 'df_cv_bert.csv')

performance_all = []
confusion_all = []

# score each of the five folds (numbered 1 to 5) separately
for i in range(1, 6):
    fold_rows = df_cv_bert[df_cv_bert['fold'] == i]
    y_true = fold_rows['true']
    y_pred = fold_rows['predicted']

    performance = report_results(y_pred, y_true)
    confusion = confusion_matrix(y_true, y_pred)

    performance_all.append(performance)
    confusion_all.append(confusion)


fold_labels = ['Fold 1', 'Fold 2', 'Fold 3', 'Fold 4', 'Fold 5']

# metrics as rows, folds as columns, plus the across-fold mean, in percent
tbla5_performance = pd.DataFrame(
    performance_all,
    columns=['Precision', 'Recall', 'F-1', 'Accuracy'],
).T
tbla5_performance.columns = fold_labels
tbla5_performance['Mean'] = tbla5_performance.T.mean()
tbla5_performance = (tbla5_performance * 100).round(2)

# flatten each fold's confusion matrix row by row into a single table row;
# the four-name column assignment below assumes a 2x2 (binary) matrix
flattened = [[cell for row in matrix for cell in row] for matrix in confusion_all]
tbla5_confusion = pd.DataFrame(flattened)
tbla5_confusion.columns = ['True Negative', 'False Positive', 'False Negative', 'True Positive']
tbla5_confusion = tbla5_confusion.T
tbla5_confusion.columns = fold_labels
tbla5_confusion['Mean'] = tbla5_confusion.T.mean()
tbla5_confusion = tbla5_confusion.round(1)

# stack the metric rows on top of the confusion-count rows and export
tbla5 = pd.concat([tbla5_performance, tbla5_confusion])

tbla5.to_csv(path_output + 'tbla5.csv')